Group Project Wilson & Mills¶

In [2]:
#Libraries Used
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.linear_model import Lasso
from sklearn.linear_model import MultiTaskLassoCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 
from sklearn.cluster import KMeans
import missingno as msno
import statsmodels.api as sm
import scipy.stats as stats
from statsmodels.formula.api import ols
from statsmodels.multivariate.manova import MANOVA
from statsmodels.stats.outliers_influence import variance_inflation_factor
import plotly.express as px

Import Data and Review¶

In [4]:
#Import Dataset
original_df = pd.read_csv("US_Accidents_March23.csv")
original_df.columns.tolist
Out[4]:
<bound method IndexOpsMixin.tolist of Index(['ID', 'Source', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat',
       'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)', 'Description',
       'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone',
       'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)',
       'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity',
       'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
       'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
       'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight'],
      dtype='object')>
In [5]:
#Data Characteristics
original_df.head(5)
Out[5]:
ID Source Severity Start_Time End_Time Start_Lat Start_Lng End_Lat End_Lng Distance(mi) ... Roundabout Station Stop Traffic_Calming Traffic_Signal Turning_Loop Sunrise_Sunset Civil_Twilight Nautical_Twilight Astronomical_Twilight
0 A-1 Source2 3 2016-02-08 05:46:00 2016-02-08 11:00:00 39.865147 -84.058723 NaN NaN 0.01 ... False False False False False False Night Night Night Night
1 A-2 Source2 2 2016-02-08 06:07:59 2016-02-08 06:37:59 39.928059 -82.831184 NaN NaN 0.01 ... False False False False False False Night Night Night Day
2 A-3 Source2 2 2016-02-08 06:49:27 2016-02-08 07:19:27 39.063148 -84.032608 NaN NaN 0.01 ... False False False False True False Night Night Day Day
3 A-4 Source2 3 2016-02-08 07:23:34 2016-02-08 07:53:34 39.747753 -84.205582 NaN NaN 0.01 ... False False False False False False Night Day Day Day
4 A-5 Source2 2 2016-02-08 07:39:07 2016-02-08 08:09:07 39.627781 -84.188354 NaN NaN 0.01 ... False False False False True False Day Day Day Day

5 rows × 46 columns

In [6]:
#Data Characteristics
original_df.describe(include = 'all')
Out[6]:
ID Source Severity Start_Time End_Time Start_Lat Start_Lng End_Lat End_Lng Distance(mi) ... Roundabout Station Stop Traffic_Calming Traffic_Signal Turning_Loop Sunrise_Sunset Civil_Twilight Nautical_Twilight Astronomical_Twilight
count 7728394 7728394 7.728394e+06 7728394 7728394 7.728394e+06 7.728394e+06 4.325632e+06 4.325632e+06 7.728394e+06 ... 7728394 7728394 7728394 7728394 7728394 7728394 7705148 7705148 7705148 7705148
unique 7728394 3 NaN 6131796 6705355 NaN NaN NaN NaN NaN ... 2 2 2 2 2 1 2 2 2 2
top A-1 Source1 NaN 2021-01-26 16:16:13 2021-11-22 08:00:00 NaN NaN NaN NaN NaN ... False False False False False False Day Day Day Day
freq 1 4325632 NaN 225 112 NaN NaN NaN NaN NaN ... 7728145 7526493 7514023 7720796 6584622 7728394 5334553 5695619 6076156 6377548
mean NaN NaN 2.212384e+00 NaN NaN 3.620119e+01 -9.470255e+01 3.626183e+01 -9.572557e+01 5.618423e-01 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
std NaN NaN 4.875313e-01 NaN NaN 5.076079e+00 1.739176e+01 5.272905e+00 1.810793e+01 1.776811e+00 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
min NaN NaN 1.000000e+00 NaN NaN 2.455480e+01 -1.246238e+02 2.456601e+01 -1.245457e+02 0.000000e+00 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
25% NaN NaN 2.000000e+00 NaN NaN 3.339963e+01 -1.172194e+02 3.346207e+01 -1.177543e+02 0.000000e+00 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
50% NaN NaN 2.000000e+00 NaN NaN 3.582397e+01 -8.776662e+01 3.618349e+01 -8.802789e+01 3.000000e-02 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
75% NaN NaN 2.000000e+00 NaN NaN 4.008496e+01 -8.035368e+01 4.017892e+01 -8.024709e+01 4.640000e-01 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
max NaN NaN 4.000000e+00 NaN NaN 4.900220e+01 -6.711317e+01 4.907500e+01 -6.710924e+01 4.417500e+02 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

11 rows × 46 columns

In [7]:
#Data Characteristics
original_df.isna().sum()
Out[7]:
ID                             0
Source                         0
Severity                       0
Start_Time                     0
End_Time                       0
Start_Lat                      0
Start_Lng                      0
End_Lat                  3402762
End_Lng                  3402762
Distance(mi)                   0
Description                    5
Street                     10869
City                         253
County                         0
State                          0
Zipcode                     1915
Country                        0
Timezone                    7808
Airport_Code               22635
Weather_Timestamp         120228
Temperature(F)            163853
Wind_Chill(F)            1999019
Humidity(%)               174144
Pressure(in)              140679
Visibility(mi)            177098
Wind_Direction            175206
Wind_Speed(mph)           571233
Precipitation(in)        2203586
Weather_Condition         173459
Amenity                        0
Bump                           0
Crossing                       0
Give_Way                       0
Junction                       0
No_Exit                        0
Railway                        0
Roundabout                     0
Station                        0
Stop                           0
Traffic_Calming                0
Traffic_Signal                 0
Turning_Loop                   0
Sunrise_Sunset             23246
Civil_Twilight             23246
Nautical_Twilight          23246
Astronomical_Twilight      23246
dtype: int64

Clean Dataset for Experiment¶

In [9]:
#Remove fields to not be modeled and make values useable for multivariate analysis
GA_DF = original_df.loc[original_df['State'] == 'GA'].drop(columns =['Source',
                                                                     'ID',
                                                                     'Description',
                                                                     'State',
                                                                     'Street',
                                                                     'End_Lat',
                                                                     'End_Lng',
                                                                     'End_Time',
                                                                     'City',
                                                                     'County',
                                                                     'Country',
                                                                     'Timezone',
                                                                     'Zipcode',
                                                                     'Bump',
                                                                     'Weather_Condition',
                                                                     'Airport_Code',
                                                                     'Wind_Direction',
                                                                     'Weather_Timestamp',
                                                                     'Civil_Twilight',
                                                                     'Nautical_Twilight',
                                                                     'Traffic_Calming',
                                                                     'Roundabout',
                                                                     'Turning_Loop',
                                                                     'Astronomical_Twilight',
                                                                    'Sunrise_Sunset',
                                                                    'Start_Time'],
                                                           axis=1
                                                          ).rename(
    columns ={'Distance(mi)' : 'Distance'}
)
with pd.option_context('future.no_silent_downcasting', True):
    GA_DF.loc[:,'Amenity':'Traffic_Signal'] = GA_DF.loc[:,'Amenity':'Traffic_Signal'].astype(int)
C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\936434541.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  GA_DF.loc[:,'Amenity':'Traffic_Signal'] = GA_DF.loc[:,'Amenity':'Traffic_Signal'].astype(int)
C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\936434541.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  GA_DF.loc[:,'Amenity':'Traffic_Signal'] = GA_DF.loc[:,'Amenity':'Traffic_Signal'].astype(int)
C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\936434541.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  GA_DF.loc[:,'Amenity':'Traffic_Signal'] = GA_DF.loc[:,'Amenity':'Traffic_Signal'].astype(int)
C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\936434541.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 1]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  GA_DF.loc[:,'Amenity':'Traffic_Signal'] = GA_DF.loc[:,'Amenity':'Traffic_Signal'].astype(int)
C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\936434541.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  GA_DF.loc[:,'Amenity':'Traffic_Signal'] = GA_DF.loc[:,'Amenity':'Traffic_Signal'].astype(int)
C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\936434541.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  GA_DF.loc[:,'Amenity':'Traffic_Signal'] = GA_DF.loc[:,'Amenity':'Traffic_Signal'].astype(int)
C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\936434541.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  GA_DF.loc[:,'Amenity':'Traffic_Signal'] = GA_DF.loc[:,'Amenity':'Traffic_Signal'].astype(int)
C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\936434541.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  GA_DF.loc[:,'Amenity':'Traffic_Signal'] = GA_DF.loc[:,'Amenity':'Traffic_Signal'].astype(int)
C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\936434541.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 1 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  GA_DF.loc[:,'Amenity':'Traffic_Signal'] = GA_DF.loc[:,'Amenity':'Traffic_Signal'].astype(int)
In [10]:
#Find missingness relationship
msno.heatmap(GA_DF, cmap='YlGnBu')
Out[10]:
<Axes: >
No description has been provided for this image
In [11]:
#Remove Missing
GA_DF = GA_DF.dropna()
In [12]:
#Verify all missing values no longer present
msno.heatmap(GA_DF, cmap='YlGnBu')
C:\Users\Mills\Anaconda\Lib\site-packages\seaborn\matrix.py:309: UserWarning: Attempting to set identical low and high xlims makes transformation singular; automatically expanding.
  ax.set(xlim=(0, self.data.shape[1]), ylim=(0, self.data.shape[0]))
C:\Users\Mills\Anaconda\Lib\site-packages\seaborn\matrix.py:309: UserWarning: Attempting to set identical low and high ylims makes transformation singular; automatically expanding.
  ax.set(xlim=(0, self.data.shape[1]), ylim=(0, self.data.shape[0]))
Out[12]:
<Axes: >
No description has been provided for this image
In [13]:
#Experiment Dataset characteristics
GA_DF.describe(include = 'all')
Out[13]:
Severity Start_Lat Start_Lng Distance Temperature(F) Wind_Chill(F) Humidity(%) Pressure(in) Visibility(mi) Wind_Speed(mph) Precipitation(in) Amenity Crossing Give_Way Junction No_Exit Railway Station Stop Traffic_Signal
count 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000
mean 2.401534 33.565999 -83.969520 0.840086 63.821845 63.106559 70.141759 29.197364 8.913840 6.161196 0.009756 0.001583 0.040446 0.002815 0.089359 0.000455 0.005558 0.000880 0.006365 0.073700
std 0.636650 0.626898 0.927210 1.992995 15.332620 16.540394 21.879941 0.382641 2.458774 4.608659 0.053045 0.039762 0.197005 0.052983 0.285262 0.021335 0.074343 0.029647 0.079527 0.261283
min 1.000000 30.626320 -85.546465 0.000000 8.000000 -10.000000 9.000000 27.790000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 2.000000 33.510780 -84.437874 0.000000 52.000000 52.000000 52.000000 28.940000 10.000000 3.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 2.000000 33.744621 -84.334777 0.143000 65.000000 65.000000 74.000000 29.080000 10.000000 6.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 3.000000 33.881139 -84.048462 0.968000 75.000000 75.000000 90.000000 29.370000 10.000000 9.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
max 4.000000 34.992405 -80.852722 95.852000 140.000000 140.000000 100.000000 30.560000 12.000000 38.000000 1.850000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [14]:
#Experiment Dataset characteristics
GA_DF.head(20)
Out[14]:
Severity Start_Lat Start_Lng Distance Temperature(F) Wind_Chill(F) Humidity(%) Pressure(in) Visibility(mi) Wind_Speed(mph) Precipitation(in) Amenity Crossing Give_Way Junction No_Exit Railway Station Stop Traffic_Signal
146516 3 33.690125 -84.500153 0.01 45.0 40.6 86.0 30.16 5.0 8.1 0.09 0 0 0 0 0 0 0 0 0
146517 3 33.665138 -84.418549 0.01 45.0 38.9 93.0 30.18 3.0 12.7 0.06 0 0 0 0 0 0 0 0 0
146518 3 33.671810 -84.328018 0.01 45.0 38.9 93.0 30.18 3.0 12.7 0.06 0 0 0 0 0 0 0 0 0
146519 3 33.892895 -84.260452 0.01 44.1 40.8 89.0 30.17 3.0 5.8 0.02 0 0 0 0 0 0 0 0 0
146520 3 33.690125 -84.500153 0.01 45.0 40.6 86.0 30.16 5.0 8.1 0.09 0 0 0 0 0 0 0 0 0
146521 3 33.681328 -84.411522 0.01 45.0 37.6 100.0 30.12 5.0 17.3 0.10 0 0 0 0 0 0 0 0 0
146522 3 33.617954 -84.484985 0.01 45.0 37.6 100.0 30.12 5.0 17.3 0.10 0 0 0 0 0 0 0 0 0
146523 3 33.745052 -84.389732 0.01 45.0 37.6 100.0 30.12 5.0 17.3 0.10 0 0 0 0 0 0 0 0 0
146524 3 33.745556 -84.349213 0.01 45.0 37.6 100.0 30.12 5.0 17.3 0.10 0 0 0 0 0 0 0 0 0
146525 3 33.885162 -84.251266 0.01 44.1 40.1 89.0 30.15 7.0 6.9 0.06 0 0 0 0 0 0 0 0 0
146526 3 33.866615 -84.249062 0.01 44.1 40.1 89.0 30.15 7.0 6.9 0.06 0 0 0 0 0 0 0 0 0
146528 3 33.699013 -84.266167 0.01 45.0 38.5 100.0 30.14 5.0 13.8 0.10 0 0 0 0 0 0 0 0 0
146529 3 33.545197 -84.268410 0.01 45.0 38.9 93.0 30.14 4.0 12.7 0.12 0 0 0 0 0 0 0 0 0
146530 3 33.656929 -84.497757 2.37 45.0 38.9 93.0 30.14 4.0 12.7 0.12 0 0 0 0 0 0 0 0 0
146531 3 33.619576 -84.460335 0.01 45.0 38.9 93.0 30.14 4.0 12.7 0.12 0 0 0 0 0 0 0 0 0
146533 3 33.743549 -84.332092 0.01 45.0 38.9 93.0 30.14 4.0 12.7 0.12 0 0 0 0 0 0 0 0 0
146535 3 33.698853 -84.265923 1.67 44.1 37.4 96.0 30.15 2.0 13.8 0.08 0 0 0 0 0 0 0 0 0
146540 3 33.753635 -84.495628 0.01 45.0 40.6 86.0 30.16 3.0 8.1 0.02 0 0 0 0 0 0 0 0 0
146541 2 33.699425 -84.457336 0.52 44.1 37.4 96.0 30.14 3.0 13.8 0.17 0 0 0 0 0 0 0 0 0
146542 3 33.823265 -84.355835 0.01 44.1 40.8 89.0 30.17 4.0 5.8 0.04 0 0 0 0 0 0 0 0 0
In [15]:
#Location Overview
levels, categories = pd.factorize(sorted(GA_DF['Severity'], reverse = False)) 
scatter = plt.scatter(GA_DF['Start_Lng'],GA_DF['Start_Lat'], s=1, c=levels)
plt.legend(scatter.legend_elements()[0], categories, title='Severity')
plt.gca().set(xlabel='Longitude', ylabel='Distance of Road Impacted (mi)', title='Georgias Most Impactful Accidents')
C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\2843138273.py:2: FutureWarning: factorize with argument that is not not a Series, Index, ExtensionArray, or np.ndarray is deprecated and will raise in a future version.
  levels, categories = pd.factorize(sorted(GA_DF['Severity'], reverse = False))
Out[15]:
[Text(0.5, 0, 'Longitude'),
 Text(0, 0.5, 'Distance of Road Impacted (mi)'),
 Text(0.5, 1.0, 'Georgias Most Impactful Accidents')]
No description has been provided for this image
In [16]:
#Create dependent and Independent variables
X = GA_DF.loc[:, ~GA_DF.columns.isin(['Severity','Distance'])]
Y = GA_DF[['Severity','Distance']]
In [17]:
#Independent variable characteristics
X.describe()
Out[17]:
Start_Lat Start_Lng Temperature(F) Wind_Chill(F) Humidity(%) Pressure(in) Visibility(mi) Wind_Speed(mph) Precipitation(in) Amenity Crossing Give_Way Junction No_Exit Railway Station Stop Traffic_Signal
count 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000 96622.000000
mean 33.565999 -83.969520 63.821845 63.106559 70.141759 29.197364 8.913840 6.161196 0.009756 0.001583 0.040446 0.002815 0.089359 0.000455 0.005558 0.000880 0.006365 0.073700
std 0.626898 0.927210 15.332620 16.540394 21.879941 0.382641 2.458774 4.608659 0.053045 0.039762 0.197005 0.052983 0.285262 0.021335 0.074343 0.029647 0.079527 0.261283
min 30.626320 -85.546465 8.000000 -10.000000 9.000000 27.790000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 33.510780 -84.437874 52.000000 52.000000 52.000000 28.940000 10.000000 3.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 33.744621 -84.334777 65.000000 65.000000 74.000000 29.080000 10.000000 6.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 33.881139 -84.048462 75.000000 75.000000 90.000000 29.370000 10.000000 9.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
max 34.992405 -80.852722 140.000000 140.000000 100.000000 30.560000 12.000000 38.000000 1.850000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

Assumption Tests¶

In [19]:
#Correlation map
sb.heatmap(X.corr(), vmax=1., square=True)
Out[19]:
<Axes: >
No description has been provided for this image
In [20]:
#All independents Q-Q Plot
fig = sm.qqplot(X, line='45')
plt.show()
No description has been provided for this image
In [21]:
#Test Multicollinearity
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                   for i in range(X.shape[1])]

vif_data
Out[21]:
feature VIF
0 Start_Lat 4841.754669
1 Start_Lng 8524.177561
2 Temperature(F) 2201.679062
3 Wind_Chill(F) 1865.221464
4 Humidity(%) 16.054051
5 Pressure(in) 2250.049006
6 Visibility(mi) 19.867626
7 Wind_Speed(mph) 3.735512
8 Precipitation(in) 1.189233
9 Amenity 1.009975
10 Crossing 1.576543
11 Give_Way 1.035104
12 Junction 1.109717
13 No_Exit 1.003336
14 Railway 1.052809
15 Station 1.016101
16 Stop 1.036660
17 Traffic_Signal 1.657900

Baseline Model¶

In [23]:
#Split test and train data
X_train, X_test, y_train, y_test = train_test_split( 
    X, Y, test_size=0.3, random_state=1) 
In [24]:
#Build basic multivariate model
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
regression_predictions = regr.predict(X_test) 
In [25]:
#MSE
mean_squared_error(y_test, regression_predictions) 
Out[25]:
2.176909428131628
In [26]:
#MAE
mean_absolute_error(y_test, regression_predictions) 
Out[26]:
0.7453151588620005
In [27]:
#R2
r2_score(y_test, regression_predictions) 
Out[27]:
0.0299620876438198

PCA¶

In [29]:
#Scaler Transformation
scaler = StandardScaler()
In [30]:
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
In [31]:
pca = PCA(.95).fit(X_train)
pca.n_components_
Out[31]:
15
In [32]:
plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Factors')
plt.ylabel('Variance (%)')
plt.title('Pre-PCA Transformation Explained Variance')
plt.show()
No description has been provided for this image
In [33]:
#PCA Transformation
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)
In [34]:
#PCA Model
logisticRegr = LogisticRegression(solver = 'lbfgs')
logisticRegr.fit(X_train,y_train['Severity'].to_numpy())
predictions = logisticRegr.predict(X_test) 
In [35]:
#MSE
mean_squared_error(y_test['Severity'], predictions) 
Out[35]:
0.5665298237140787
In [36]:
#MAE
mean_absolute_error(y_test['Severity'], predictions) 
Out[36]:
0.42301721461344743
In [37]:
#R2
r2_score(y_test['Severity'], predictions) 
Out[37]:
-0.39302769823280603
In [48]:
#Model coefficients
pca = PCA()
pca.fit(X)
explained_variance_ratio = pca.explained_variance_ratio_
explained_variance_ratio
Out[48]:
array([5.91402289e-01, 3.81855025e-01, 1.95532775e-02, 4.60423023e-03,
       1.10073051e-03, 1.02928631e-03, 2.06317484e-04, 8.50440565e-05,
       7.36743771e-05, 4.96977398e-05, 2.11041699e-05, 6.27070398e-06,
       5.16464128e-06, 2.63330694e-06, 2.40588890e-06, 1.55576579e-06,
       8.47049893e-07, 4.46516706e-07])
In [50]:
#Scree Plot
plt.figure(figsize=(8, 6))
plt.plot(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio, marker='o', linestyle='--')
plt.title('PCA Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.xticks(np.arange(1, len(explained_variance_ratio) + 1, 1))
plt.show()
No description has been provided for this image
In [52]:
#Explained Variance
plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)')
plt.title('Explained Variance')
plt.show()
No description has been provided for this image
In [54]:
#Store components 
pca = PCA(n_components=3)
components = pca.fit_transform(X)
In [56]:
#MANOVA
data_pca = pd.concat([Y.reset_index(drop=True), pd.DataFrame(components,columns = ['PC1','PC2','PC3']).reset_index(drop=True)],axis=1)
formula = 'Severity + Distance ~ PC1 + PC2 +PC3'
manova_pca = MANOVA.from_formula(formula, data = data_pca)
results = manova_pca.mv_test()
print(results)
                     Multivariate linear model
====================================================================
                                                                    
--------------------------------------------------------------------
       Intercept         Value  Num DF   Den DF     F Value   Pr > F
--------------------------------------------------------------------
          Wilks' lambda  0.0634 2.0000 96617.0000 713623.0480 0.0000
         Pillai's trace  0.9366 2.0000 96617.0000 713623.0480 0.0000
 Hotelling-Lawley trace 14.7722 2.0000 96617.0000 713623.0480 0.0000
    Roy's greatest root 14.7722 2.0000 96617.0000 713623.0480 0.0000
--------------------------------------------------------------------
                                                                    
--------------------------------------------------------------------
              PC1           Value  Num DF   Den DF   F Value  Pr > F
--------------------------------------------------------------------
              Wilks' lambda 0.9968 2.0000 96617.0000 153.3970 0.0000
             Pillai's trace 0.0032 2.0000 96617.0000 153.3970 0.0000
     Hotelling-Lawley trace 0.0032 2.0000 96617.0000 153.3970 0.0000
        Roy's greatest root 0.0032 2.0000 96617.0000 153.3970 0.0000
--------------------------------------------------------------------
                                                                    
---------------------------------------------------------------------
           PC2            Value   Num DF    Den DF    F Value  Pr > F
---------------------------------------------------------------------
           Wilks' lambda  0.9987  2.0000  96617.0000  65.1561  0.0000
          Pillai's trace  0.0013  2.0000  96617.0000  65.1561  0.0000
  Hotelling-Lawley trace  0.0013  2.0000  96617.0000  65.1561  0.0000
     Roy's greatest root  0.0013  2.0000  96617.0000  65.1561  0.0000
--------------------------------------------------------------------
                                                                    
---------------------------------------------------------------------
           PC3            Value   Num DF    Den DF    F Value  Pr > F
---------------------------------------------------------------------
           Wilks' lambda  0.9998  2.0000  96617.0000  11.4135  0.0000
          Pillai's trace  0.0002  2.0000  96617.0000  11.4135  0.0000
  Hotelling-Lawley trace  0.0002  2.0000  96617.0000  11.4135  0.0000
     Roy's greatest root  0.0002  2.0000  96617.0000  11.4135  0.0000
====================================================================

In [58]:
#PC1 vs PC2
fig = px.scatter(components, x=0, y=1, color=GA_DF['Severity'],labels={
                     "0": "PC1",
                     "1": "PC2",
                     "color": "Severity"
                 },
                title="PCA Scatterplot (PC1 vs. PC2)")
fig.show()
In [59]:
#PC1 vs PC3
fig = px.scatter(components, x=0, y=2, color=GA_DF['Severity'],labels={
                     "0": "PC1",
                     "2": "PC3",
                     "color": "Severity"
                 },
                title="PCA Scatterplot (PC1 vs. PC3)")
fig.show()
In [61]:
#PC2 vs PC3
fig = px.scatter(components, x=1, y=2, color=GA_DF['Severity'],labels={
                     "1": "PC2",
                     "2": "PC3",
                     "color": "Severity"
                 },
                title="PCA Scatterplot (PC2 vs. PC3)")
fig.show()
In [62]:
# 3D Plot
total_var = pca.explained_variance_ratio_.sum() * 100

fig = px.scatter_3d(
    components, x=0, y=1, z=2, color=GA_DF['Severity'],
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'}
)
fig.show()
In [63]:
#Loadings Plot
loadings = pca.components_  
for i in range(3):
    plt.plot(loadings[i], label=f'PC {i+1}', marker='o')

plt.title('Loading Plot')
plt.xlabel('Features')
plt.ylabel('Loading Value')
plt.ylim(-1, 1)
plt.legend()
plt.grid(True)
plt.show()
No description has been provided for this image
In [65]:
# Sum of loadings 
loadings[0]+loadings[1]+loadings[2]
Out[65]:
array([-7.34207344e-03,  1.58423167e-03,  2.19539299e-02, -1.17291688e-01,
        1.46574046e+00, -1.10696692e-02, -1.42066332e-01,  9.03878257e-01,
        2.26310643e-03, -1.08027636e-05, -4.29280601e-04, -2.72007040e-05,
        1.10087902e-03,  1.28605669e-05,  2.15818864e-05, -3.38563052e-05,
       -2.88299942e-04, -1.13891322e-03])
In [67]:
# Q-Q plots against normal distribution
%matplotlib tk
fig, axes = plt.subplots(6, 3, figsize=(15, 5), layout = 'constrained')
axes = axes.flatten()
for i, col in enumerate(X.columns):
    stats.probplot(GA_DF[col], dist="norm", plot=axes[i])
    axes[i].set_title(f"Q-Q Plot of {col}")
plt.show()

LASSO Model¶

In [73]:
#LASSO train and test scaler transform
LassoX_train = scaler.fit_transform(X_train)
LassoX_test = scaler.transform(X_test)
In [75]:
#Lasso model
model = Lasso()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
model = MultiTaskLassoCV(alphas=np.arange(0, 1, 0.01), cv=cv, n_jobs=-1).fit(LassoX_train, y_train)
print('alpha: %f' % model.alpha_)
C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: UserWarning:

Coordinate descent with l1_reg=0 may lead to unexpected results and is discouraged.

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 131935.87954872157, tolerance: 26.818999514559525

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 135542.32259107713, tolerance: 27.543266845558065

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 131321.56251747793, tolerance: 26.69405847159424

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 134038.86828425183, tolerance: 27.23777233103883

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 126686.78579102649, tolerance: 25.764757318567447

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 125841.04588310918, tolerance: 25.59711961904939

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 135811.37739642186, tolerance: 27.59570762476094

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 134501.771932224, tolerance: 27.332988521505126

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 132562.1790970095, tolerance: 26.947903828166414

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 135460.82148482272, tolerance: 27.530134415960905

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 132519.93085208983, tolerance: 26.940069947787528

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 128314.55720768594, tolerance: 26.092418209251296

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 127696.63589309895, tolerance: 25.971860454500035

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 130727.47302150655, tolerance: 26.578294114195913

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 135783.62956220293, tolerance: 27.59398324902429

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 134326.5283312325, tolerance: 27.297875403834265

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 124319.30969121488, tolerance: 25.29158977840956

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 132890.42456852185, tolerance: 27.01240581898566

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 125846.97074779686, tolerance: 25.5947860975266

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 132990.51679790937, tolerance: 27.025420399330663

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 126789.84782092765, tolerance: 25.792506464047918

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 126005.54226547966, tolerance: 25.630746942442663

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 130140.40727755838, tolerance: 26.454039466686712

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 127436.04607645723, tolerance: 25.917250174170572

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 132859.06248942617, tolerance: 27.005384722123807

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 124236.74305945201, tolerance: 25.28099105235222

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 128376.38093085267, tolerance: 26.103525850683134

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: UserWarning:

Coordinate descent with l1_reg=0 may lead to unexpected results and is discouraged.

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 125294.03780777332, tolerance: 25.486902933789388

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 130910.63845642326, tolerance: 26.61761739496349

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 126840.30866713515, tolerance: 25.79445105932087

C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:2559: UserWarning:

Coordinate descent with l1_reg=0 may lead to unexpected results and is discouraged.

alpha: 0.000000
C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:2559: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations. Duality gap: 144743.29109283234, tolerance: 29.427627354713856

In [78]:
#Prediction to test MSE et all
predictions = model.predict(LassoX_test) 
In [80]:
#MSE
mean_squared_error(y_test, predictions)
Out[80]:
2.1847991973425933
In [82]:
#MAE
mean_absolute_error(y_test, predictions) 
Out[82]:
0.7488280364373681
In [84]:
#R2
r2_score(y_test, predictions) 
Out[84]:
0.02204012759790447

All of United States for Comparison¶

In [86]:
#Apply same data cleaning as GA dataset
US_DF = original_df.loc[original_df['State'] != 'GA'].dropna().drop(columns =['Source',
                                                                     'ID',
                                                                     'Description',
                                                                     'State',
                                                                     'Street',
                                                                     'End_Lat',
                                                                     'End_Lng',
                                                                     'End_Time',
                                                                     'City',
                                                                     'County',
                                                                     'Country',
                                                                     'Timezone',
                                                                     'Zipcode',
                                                                     'Bump',
                                                                     'Weather_Condition',
                                                                     'Airport_Code',
                                                                     'Wind_Direction',
                                                                     'Weather_Timestamp',
                                                                     'Civil_Twilight',
                                                                     'Nautical_Twilight',
                                                                     'Traffic_Calming',
                                                                     'Roundabout',
                                                                     'Turning_Loop',
                                                                     'Astronomical_Twilight',
                                                                    'Sunrise_Sunset',
                                                                    'Start_Time'],
                                                           axis=1
                                                          ).rename(
    columns ={'Distance(mi)' : 'Distance'}
)
with pd.option_context('future.no_silent_downcasting', True):
    US_DF.loc[:,'Amenity':'Traffic_Signal'] = US_DF.loc[:,'Amenity':'Traffic_Signal'].astype(int)
C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\1931142849.py:33: FutureWarning:

Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.

C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\1931142849.py:33: FutureWarning:

Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.

C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\1931142849.py:33: FutureWarning:

Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.

C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\1931142849.py:33: FutureWarning:

Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 1 ... 1 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.

C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\1931142849.py:33: FutureWarning:

Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.

C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\1931142849.py:33: FutureWarning:

Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.

C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\1931142849.py:33: FutureWarning:

Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.

C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\1931142849.py:33: FutureWarning:

Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.

C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\1931142849.py:33: FutureWarning:

Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.

In [88]:
#Split test and training
X_all = US_DF.loc[:, ~US_DF.columns.isin(['Severity','Distance'])]
Y_all = US_DF[['Severity','Distance']]
In [90]:
#Predictions to test MSE et all
all_predictions = regr.predict(X_all) 
In [92]:
#MSE
mean_squared_error(Y_all, all_predictions)
Out[92]:
7.551554730006793
In [94]:
#MAE
mean_absolute_error(Y_all, all_predictions) 
Out[94]:
2.1411786002666657
In [96]:
#R2
r2_score(Y_all, all_predictions) 
Out[96]:
-27.553697665435195